import os
import time
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.datasets import fetch_20newsgroups
from sklearn.manifold import TSNE
import bokeh.plotting as bp
from bokeh.plotting import save
from bokeh.models import HoverTool
# --- Run settings -----------------------------------------------------------
n_iter = 500        # reported in the plot title (t-SNE iteration label) only
n_top_words = 5     # number of top words shown per topic in the plot
threshold = 0.0     # minimum max-topic probability for a doc to be plotted
# Strip message metadata so topics are learned from the bodies only.
remove = ('headers', 'footers', 'quotes')
newsgroups = fetch_20newsgroups(subset='all', remove=remove)
# newsgroups_test = fetch_20newsgroups(subset='test', remove=remove)
# Lowercase each message and keep only purely alphabetic words:
# str.isalpha drops any token containing digits or punctuation characters.
corpus_raw = [' '.join(filter(str.isalpha, raw.lower().split())) for raw in
              newsgroups.data]
# print(newsgroups_train.data)
print("Before:\n", newsgroups.data[0])
print("After:\n", corpus_raw[0])
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
import string
# NOTE: this rebinding shadows the nltk `stopwords` module imported above;
# from here on the name refers to the English stopword *set* used by cleaning().
stopwords = set(stopwords.words('english'))
punctuation = set(string.punctuation)  # individual punctuation characters
lemmatize = WordNetLemmatizer()        # WordNet lemmatizer (noun POS by default)
def cleaning(article):
    """Clean and tokenize one document.

    Pipeline: drop English stopwords (whole-word match), strip punctuation
    characters, lemmatize each remaining token with WordNet.

    Parameters
    ----------
    article : str
        Raw (already lowercased) document text.

    Returns
    -------
    list of str
        Cleaned, lemmatized tokens; an empty list for an empty document.
    """
    # Stopwords are removed first, while tokens are still whole words.
    no_stop = " ".join(w for w in article.split() if w not in stopwords)
    # Strip punctuation characters anywhere inside the remaining text.
    no_punct = "".join(ch for ch in no_stop if ch not in punctuation)
    # Lemmatize token by token.
    lemmas = " ".join(lemmatize.lemmatize(w) for w in no_punct.split())
    # Fix: use split() rather than split(" ") — the latter returns ['']
    # for an empty document, which injects a bogus '' token into the
    # gensim dictionary downstream.
    return lemmas.split()
# Clean/tokenize every training document with cleaning() above.
corpus_tokenized = [cleaning(doc) for doc in corpus_raw]
print(corpus_tokenized[0])
# NOTE: shadows the `time` module imported at the top; only time() is used below.
from time import time
import logging
# Send gensim/LDA training progress to a log file instead of the console.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO,
                    filename='running.log', filemode='w')
# Importing Gensim
import gensim
from gensim import corpora
from sklearn.feature_extraction.text import CountVectorizer  # duplicate of the top-of-file import; unused here
# Creating the term dictionary of our corpus, where every unique term is assigned an index.
dictionary = corpora.Dictionary(corpus_tokenized)
# Bag-of-words representation: one list of (token_id, count) pairs per document.
doc_term_matrix = [dictionary.doc2bow(doc) for doc in corpus_tokenized]
print("Len of raw corpus: %i | Len of matrix: %i" % (len(corpus_raw), len(doc_term_matrix)))
print("Processed:\n", doc_term_matrix[0])
from gensim.models.ldamodel import LdaModel
start = time()
# Creating the object for LDA model using gensim library
# Lda = gensim.models.ldamodel.LdaModel
# Get topics
# One topic per newsgroup category (20 for the full 20newsgroups set).
num_topics = len(newsgroups.target_names)
print(num_topics)
# Running and training LDA model on the document term matrix (50 passes).
ldamodel = LdaModel(doc_term_matrix, num_topics=num_topics, id2word = dictionary, passes=50)
print('used: {:.2f}s'.format(time()-start))
# Persist the trained model so it can be reloaded without retraining.
ldamodel.save('topic.model')
print("Model Saved")
# Reload the saved model (sanity check of the save/load round-trip).
from gensim.models import LdaModel
loaded_model = LdaModel.load('topic.model')
print(loaded_model.print_topics(num_topics=2, num_words=4))
# Topics: print_topics yields (topic_id, word-distribution string) pairs;
# print each element of the pair on its own line.
for i in ldamodel.print_topics():
    for j in i: print(j)
# Lets test it out
# NOTE(review): subset='all' means this "test" corpus is the same data the
# model was trained on; the commented-out line below suggests subset='test'
# was intended — confirm.
remove = ('headers', 'footers', 'quotes')
newsgroups_test = fetch_20newsgroups(subset='all', remove=remove)
# newsgroups_test = fetch_20newsgroups(subset='test', remove=remove)
# Same preprocessing as the training corpus: lowercase, alphabetic words only.
corpus_raw_test = [' '.join(filter(str.isalpha, raw.lower().split())) for raw in
                   newsgroups_test.data]
print("Original Sentence:\n", newsgroups_test.data[0])
corpus_tokenized_test = [cleaning(doc) for doc in corpus_raw_test]
# doc2bow with the TRAINING dictionary: words unseen in training are dropped.
doc_term_matrix_test = [dictionary.doc2bow(doc) for doc in corpus_tokenized_test]
print("\nAfter processing:\n", doc_term_matrix_test[0])
# Topic-probability pairs for one sample document.
test_output = loaded_model[doc_term_matrix_test[100]]
print("Output:\n", test_output)
for i in test_output:
    print(i)
# Put per-document topic probabilities into a dense (n_docs, n_topics) matrix.
# Rows default to all zeros; only topics gensim reports are filled in.
prob_matrix = np.zeros((len(doc_term_matrix_test), num_topics))
for i, doc in enumerate(doc_term_matrix_test):
    predictions = loaded_model[doc]  # list of (topic_id, probability)
    # Guard: gensim omits topics below its minimum_probability cutoff, so
    # `predictions` can be empty — zip(*[]) would raise ValueError.
    if predictions:
        idx, prob = zip(*predictions)
        prob_matrix[i, idx] = prob
# t-SNE
# Keep only documents whose strongest topic probability exceeds the threshold
# (with threshold=0.0 this keeps every doc with a strictly positive maximum).
_idx = np.amax(prob_matrix, axis=1) > threshold  # idx of news that > threshold
_topics = prob_matrix[_idx]
num_example = len(_topics)
# t-SNE: project num_topics-dimensional probability vectors down to 2D.
tsne_model = TSNE(n_components=2, verbose=1, random_state=0, angle=.99,
                  init='pca')
# [:num_example] is a full slice here (num_example == len(_topics)).
tsne_lda = tsne_model.fit_transform(_topics[:num_example])
# find the most probable topic for each news
_lda_keys = []
for i in range(_topics.shape[0]):
    # trailing comma builds a 1-tuple; += extends the list by one argmax
    _lda_keys += _topics[i].argmax(),
# Build a short human-readable summary (space-joined top words) per topic.
topic_summaries = []
for topic_id in range(num_topics):
    top_words = loaded_model.show_topic(topic_id, topn=n_top_words)
    topic_summaries.append(' '.join(w for w, _ in top_words))
# 20 colors — one per topic; indexed by each document's dominant-topic id.
colormap = np.array([
    "#8c564b", "#c49c94", "#e377c2", "#f7b6d2", "#7f7f7f",
    "#1f77b4", "#aec7e8", "#ff7f0e", "#ffbb78", "#2ca02c",
    "#98df8a", "#d62728", "#ff9896", "#9467bd", "#c5b0d5",
    "#c7c7c7", "#bcbd22", "#dbdb8d", "#17becf", "#9edae5"
])
# Figure title summarizing the run configuration.
title = "[20 newsgroups] t-SNE visualization of LDA model trained on {} news, " \
        "{} topics, thresholding at {} topic probability, {} iter ({} data " \
        "points and top {} words)".format(
    prob_matrix.shape[0], num_topics, threshold, n_iter, num_example, n_top_words)
from bokeh.plotting import figure, output_file, show
from bokeh.models import ColumnDataSource, CDSView
from bokeh.io import output_notebook
output_notebook()  # render inline when run inside a Jupyter notebook
# NOTE(review): plot_width/plot_height and the 'previewsave' tool belong to
# older bokeh APIs (renamed to width/height, tool removed in later releases) —
# confirm the installed bokeh version supports them.
p = bp.figure(plot_width=1400, plot_height=1100,
              title=title,
              tools="pan,wheel_zoom,box_zoom,reset,hover,previewsave",
              x_axis_type=None, y_axis_type=None, min_border=1)
# Data backing the scatter: 2D t-SNE coords, per-doc color, raw text, topic id.
source = ColumnDataSource(data=dict(
    x=tsne_lda[:,0],
    y=tsne_lda[:, 1],
    color=colormap[_lda_keys][:num_example],
    content=corpus_raw_test[:num_example],
    topic_key=_lda_keys[:num_example]
    )
)
p.scatter(x='x', y='y', color='color', source=source)
# Coordinates at which to draw each topic's label: the t-SNE position of the
# first plotted document whose dominant topic is that topic. Start all-NaN.
topic_coord = np.empty((prob_matrix.shape[1], 2)) * np.nan
for topic_num in _lda_keys:
    # Stop early once every topic has been assigned a coordinate.
    if not np.isnan(topic_coord).any():
        break
    topic_coord[topic_num] = tsne_lda[_lda_keys.index(topic_num)]
# plot crucial words
# NOTE(review): a topic that is never any document's argmax keeps NaN
# coordinates here, so its label is drawn off-canvas — confirm acceptable.
for i in range(prob_matrix.shape[1]):
    p.text(topic_coord[i, 0], topic_coord[i, 1], [topic_summaries[i]])
# hover tools: show the document text and its dominant topic id on hover.
hover = p.select(dict(type=HoverTool))
hover.tooltips = {"content": "@content - topic: @topic_key"}
# p.scatter(x=tsne_lda[:,0], y=tsne_lda[:, 1], color=colormap[_lda_keys][:num_example])
show(p)